import pandas as pd
import plotly.express as px
import math
# Load the pickled frame and count rows for every (bot, type) combination.
data = pd.read_pickle('data/train.pkl')
changes_by_type = data.groupby(['bot', 'type']).size().reset_index(name='count')

# Bar chart of the counts per change type, coloured by the 'bot' column.
# The figure is the value of a bare expression, so it is only rendered where
# the environment echoes expression results (e.g. a notebook cell).
type_axis_labels = {
    'bot': 'User is bot',
    'type': 'Change type',
    'count': 'Count of changes',
}
px.bar(
    changes_by_type,
    x='type',
    y='count',
    color='bot',
    title='Distribution of changes by type and user category',
    width=600,
    height=500,
    labels=type_axis_labels,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
# Label every row by whether the user name contains "bot" (case-insensitive).
# The label is assigned in a single step: calling
# `data[col].mask(..., inplace=True)` mutates a temporary Series and does not
# update `data` under pandas Copy-on-Write, and it also writes strings into an
# integer column. `na=False` treats a missing user name as "no match" instead
# of producing NaN.
has_bot_in_name = data['user'].str.contains('bot', case=False, na=False)
data['bot_in_username'] = has_bot_in_name.map(
    {True: "Have 'bot'", False: "Don't have 'bot'"}
)

# Count rows for every (bot, bot_in_username) combination.
bot_in_username = data.groupby(['bot', 'bot_in_username']).size().reset_index().rename(columns={0: 'count'})

# Bar chart of the counts per label, coloured by the 'bot' column.
px.bar(bot_in_username, x='bot_in_username', y='count', color='bot',
       title="Distribution by presence of 'bot' in the name and category",
       width=600, height=500,
       labels={'bot': 'User is bot', 'bot_in_username': "'bot' is in the user name", 'count': 'Count of changes'},
       color_discrete_sequence=px.colors.qualitative.Pastel)
# Comment length in characters, and the same length rounded up to the next
# multiple of 50.
comment_chars = data['comment'].str.len()
data['comment_len'] = comment_chars
data['comment_len_in50'] = comment_chars.div(50).apply(math.ceil).mul(50)

# Histogram of comment lengths, coloured by the 'bot' column. Rows with a
# comment longer than 300 characters are left out so the plot stays legible.
short_comments = data.loc[data['comment_len'].le(300)]
comment_axis_labels = {
    'bot': 'User is bot',
    'comment_len': "Comment length",
    'count': 'Count of changes',
}
px.histogram(
    short_comments,
    x='comment_len',
    color='bot',
    title="Distribution by comment lengths and user category",
    width=600,
    height=500,
    labels=comment_axis_labels,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
# Signed difference between the new and old length of each change; rows where
# the difference is missing are treated as 0.
data['revision_len'] = (data['length.new'] - data['length.old']).fillna(0)
# NOTE: despite the "_in10000" suffix, this rounds up to the next multiple of
# 100. The column name is kept so any later code that reads it still works.
data['revision_len_in10000'] = (data['revision_len'] / 100).apply(math.ceil) * 100

# Histogram of revision lengths, coloured by the 'bot' column. Only rows with
# a difference strictly between -7000 and 7000 are plotted.
# The label key must match the plotted column ('revision_len'); the previous
# key 'revision_len_in10000' never matched, so the x-axis title was not applied.
px.histogram(data[data['revision_len'].abs() < 7000], x='revision_len', color='bot',
             title="Distribution by revision lengths and user category",
             width=600, height=500,
             labels={'bot': 'User is bot', 'revision_len': "Revision length", 'count': 'Count of changes'},
             color_discrete_sequence=px.colors.qualitative.Pastel)
# Interpret 'timestamp' as seconds since the epoch.
data['datetime'] = pd.to_datetime(data['timestamp'], unit='s')

# Number of changes each user made in each one-minute bucket.
one_minute = pd.Grouper(key='datetime', freq='min')
changes_per_min = (
    data.groupby(['user', one_minute])
    .size()
    .reset_index(name='changes_per_min')
    .rename(columns={'datetime': 'minute'})
)

# Mean of those per-minute counts for every user, joined back onto each row.
avg_changes_per_min = (
    changes_per_min.groupby('user')['changes_per_min']
    .mean()
    .reset_index(name='avg_changes_per_min')
)
data = data.merge(avg_changes_per_min, how='left', on='user')

# The same average rounded up to the next multiple of 10.
data['avg_changes_per_min_in10'] = data['avg_changes_per_min'].div(10).apply(math.ceil).mul(10)

# Row counts per (bot, rounded average); note that this rebinds
# `avg_changes_per_min`. The histogram below reads `data`, not this frame.
avg_changes_per_min = (
    data.groupby(['bot', 'avg_changes_per_min_in10'])
    .size()
    .reset_index(name='count')
)

# Histogram of the unrounded per-user average, coloured by the 'bot' column.
per_min_axis_labels = {
    'bot': 'User is bot',
    'avg_changes_per_min': "Average number of changes per minute",
    'count': 'Count of changes',
}
px.histogram(
    data,
    x='avg_changes_per_min',
    color='bot',
    title="Distribution by number of changes per minute and user category",
    width=600,
    height=500,
    labels=per_min_axis_labels,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
# Average number of changes per user in 2-second buckets.
# The lowercase '2s' alias is used because the uppercase 'S' alias is
# deprecated in recent pandas releases.
changes_per_2s = data.groupby(['user', pd.Grouper(key='datetime', freq='2s')]).size().reset_index()\
    .rename(columns={0: 'changes_per_2s', 'datetime': 'second'})

# Mean of those per-bucket counts for every user, joined back onto each row.
avg_changes_per_2s = changes_per_2s.groupby('user').agg(avg_changes_per_2s=('changes_per_2s', 'mean')).reset_index()
data = pd.merge(data, avg_changes_per_2s, how='left', on='user')

# Round the average up to a whole number; this overwrites the merged column.
data['avg_changes_per_2s'] = data['avg_changes_per_2s'].apply(math.ceil)

# Histogram of the rounded average, coloured by the 'bot' column. Rows with a
# value above 10 are left out to cut off the extremes.
px.histogram(data[data['avg_changes_per_2s'] <= 10], x='avg_changes_per_2s', color='bot',
             title="Distribution by number of changes per 2 sec and user category",
             width=600, height=500,
             labels={'bot': 'User is bot', 'avg_changes_per_2s': "Average number of changes per 2 seconds <", 'count': 'Count of changes'},
             color_discrete_sequence=px.colors.qualitative.Pastel)